Read in the data


In [2]:
import pandas
import numpy
import re

data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]

data = {}

for f in data_files:
    d = pandas.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d

Read in the surveys


In [3]:
all_survey = pandas.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pandas.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pandas.concat([all_survey, d75_survey], axis=0)

survey["DBN"] = survey["dbn"]

survey_fields = [
    "DBN", 
    "rr_s", 
    "rr_t", 
    "rr_p", 
    "N_s", 
    "N_t", 
    "N_p", 
    "saf_p_11", 
    "com_p_11", 
    "eng_p_11", 
    "aca_p_11", 
    "saf_t_11", 
    "com_t_11", 
    "eng_t_10", 
    "aca_t_11", 
    "saf_s_11", 
    "com_s_11", 
    "eng_s_11", 
    "aca_s_11", 
    "saf_tot_11", 
    "com_tot_11", 
    "eng_tot_11", 
    "aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey

Add DBN columns


In [4]:
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

def pad_csd(num):
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return "0" + string_representation
    
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

Convert columns to numeric


In [5]:
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pandas.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]

def find_lat(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)

data["hs_directory"]["lat"] = pandas.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pandas.to_numeric(data["hs_directory"]["lon"], errors="coerce")

Condense datasets


In [6]:
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

Convert AP scores to numeric


In [7]:
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    data["ap_2010"][col] = pandas.to_numeric(data["ap_2010"][col], errors="coerce")

Combine the datasets


In [8]:
combined = data["sat_results"]

combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]

for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

combined = combined.fillna(combined.mean())
combined = combined.fillna(0)

Add a school district column for mapping


In [9]:
def get_first_two_chars(dbn):
    return dbn[0:2]

combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

Find correlations


In [10]:
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)


SAT Critical Reading Avg. Score         0.986820
SAT Math Avg. Score                     0.972643
SAT Writing Avg. Score                  0.987771
sat_score                               1.000000
AP Test Takers                          0.523140
Total Exams Taken                       0.514333
Number of Exams with scores 3 4 or 5    0.463245
Total Cohort                            0.325144
CSD                                     0.042948
NUMBER OF STUDENTS / SEATS FILLED       0.394626
NUMBER OF SECTIONS                      0.362673
AVERAGE CLASS SIZE                      0.381014
SIZE OF SMALLEST CLASS                  0.249949
SIZE OF LARGEST CLASS                   0.314434
SCHOOLWIDE PUPIL-TEACHER RATIO               NaN
schoolyear                                   NaN
fl_percent                                   NaN
frl_percent                            -0.722225
total_enrollment                        0.367857
ell_num                                -0.153778
ell_percent                            -0.398750
sped_num                                0.034933
sped_percent                           -0.448170
asian_num                               0.475445
asian_per                               0.570730
black_num                               0.027979
black_per                              -0.284139
hispanic_num                            0.025744
hispanic_per                           -0.396985
white_num                               0.449559
                                          ...   
rr_p                                    0.047925
N_s                                     0.423463
N_t                                     0.291463
N_p                                     0.421530
saf_p_11                                0.122913
com_p_11                               -0.115073
eng_p_11                                0.020254
aca_p_11                                0.035155
saf_t_11                                0.313810
com_t_11                                0.082419
eng_t_10                                     NaN
aca_t_11                                0.132348
saf_s_11                                0.337639
com_s_11                                0.187370
eng_s_11                                0.213822
aca_s_11                                0.339435
saf_tot_11                              0.318753
com_tot_11                              0.077310
eng_tot_11                              0.100102
aca_tot_11                              0.190966
grade_span_max                               NaN
expgrade_span_max                            NaN
zip                                    -0.063977
total_students                          0.407827
number_programs                         0.117012
priority08                                   NaN
priority09                                   NaN
priority10                                   NaN
lat                                    -0.121029
lon                                    -0.132222
Name: sat_score, dtype: float64

In [11]:
survey_fields


Out[11]:
['DBN',
 'rr_s',
 'rr_t',
 'rr_p',
 'N_s',
 'N_t',
 'N_p',
 'saf_p_11',
 'com_p_11',
 'eng_p_11',
 'aca_p_11',
 'saf_t_11',
 'com_t_11',
 'eng_t_10',
 'aca_t_11',
 'saf_s_11',
 'com_s_11',
 'eng_s_11',
 'aca_s_11',
 'saf_tot_11',
 'com_tot_11',
 'eng_tot_11',
 'aca_tot_11']

Plotting survey correlations


In [12]:
%matplotlib inline
correlations[survey_fields].plot.bar(figsize=(12,6))


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f000948d3c8>

Conclusions

There are high correlations between N_s, N_t, N_p and sat_score. Since these columns are correlated with total_enrollment, it makes sense that they would be high. It is more interesting that rr_s, the student response rate, or the percentage of students that completed the survey, correlates with sat_score. This might make sense because students who are more likely to fill out surveys may be more likely to also be doing well academically. How students and teachers percieved safety (saf_t_11 and saf_s_11) correlate with sat_score. This make sense, as it's hard to teach or learn in an unsafe environment. The last interesting correlation is the aca_s_11, which indicates how the student perceives academic standards, correlates with sat_score, but this is not true for aca_t_11, how teachers perceive academic standards, or aca_p_11, how parents perceive academic standards.


In [13]:
combined.plot.scatter("saf_s_11", "sat_score")


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f000946c710>

There is not strong correlation between SAT scores and safety. Maybe a few schools have high SAT scores and high safety scores. But also there are

Map out safety scores


In [ ]:


In [14]:
district_safety = combined.groupby("school_dist").agg(numpy.mean)
district_safety.reset_index(inplace=True)
district_safety.head()


Out[14]:
school_dist SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score sat_score AP Test Takers Total Exams Taken Number of Exams with scores 3 4 or 5 Total Cohort CSD ... grade_span_max expgrade_span_max zip total_students number_programs priority08 priority09 priority10 lat lon
0 01 441.833333 473.333333 439.333333 1354.500000 116.681090 173.019231 135.800000 93.500000 1.0 ... 12.0 12.0 10003.166667 659.500000 1.333333 0.0 0.0 0.0 40.719022 -73.982377
1 02 426.619092 444.186256 424.832836 1295.638184 128.908454 201.516827 157.495833 158.647849 2.0 ... 12.0 12.0 10023.770833 621.395833 1.416667 0.0 0.0 0.0 40.739699 -73.991386
2 03 428.529851 437.997512 426.915672 1293.443035 156.183494 244.522436 193.087500 183.384409 3.0 ... 12.0 12.0 10023.750000 717.916667 2.000000 0.0 0.0 0.0 40.781574 -73.977370
3 04 402.142857 416.285714 405.714286 1224.142857 129.016484 183.879121 151.035714 113.857143 4.0 ... 12.0 12.0 10029.857143 580.857143 1.142857 0.0 0.0 0.0 40.793449 -73.943215
4 05 427.159915 438.236674 419.666098 1285.062687 85.722527 115.725275 142.464286 143.677419 5.0 ... 12.0 12.0 10030.142857 609.857143 1.142857 0.0 0.0 0.0 40.817077 -73.949251

5 rows × 68 columns


In [15]:
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
m.fillcontinents(color='white',lake_color='#85A6D9')
longitudes = district_safety['lon'].tolist()
latitudes = district_safety['lat'].tolist()

m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=district_safety['saf_s_11'], cmap='summer')
    
plt.show()


It looks like Upper Manhattan and parts of Queens and the Bronx tend to have lower safety scores, whereas Brooklyn has high safety scores.


In [16]:
races = [
    'white_per',
    'asian_per',
    'black_per',
    'hispanic_per']
correlations[races].plot.bar()


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f00094c5d30>

It looks like a higher percentage of white or asian students at a school correlates positively with sat score, whereas a higher percentage of black or hispanic students correlates negatively with sat score. This may be due to a lack of funding for schools in certain areas, which are more likely to have a higher percentage of black or hispanic students.


In [17]:
combined.plot.scatter("hispanic_per", "sat_score")


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f00094b3240>

SAT score > 1500 is only for schools where percent of hispanic not more than 40%. If school has 100% of hispanic - SAT score not more than 1100


In [18]:
full_hispanic = combined[combined['hispanic_per'] > 95]
full_hispanic


Out[18]:
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score sat_score SchoolName AP Test Takers Total Exams Taken ... priority05 priority06 priority07 priority08 priority09 priority10 Location 1 lat lon school_dist
44 02M542 MANHATTAN BRIDGES HIGH SCHOOL 66 336.0 378.0 344.0 1058.0 Manhattan Bridges High School 67.000000 102.000000 ... 0 0 0 0 0 0 525 West 50Th Street\nNew York, NY 10019\n(40.... 40.765027 -73.992517 02
82 06M348 WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL 70 380.0 395.0 399.0 1174.0 0 129.028846 197.038462 ... Then to New York City residents 0 0 0 0 0 511 West 182Nd Street\nNew York, NY 10033\n(40... 40.848879 -73.930807 06
89 06M552 GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... 56 339.0 349.0 326.0 1014.0 GREGORIO LUPERON HS SCI & MATH 88.000000 138.000000 ... 0 0 0 0 0 0 501 West 165Th\nNew York, NY 10032\n(40.838032... 40.838032 -73.938371 06
125 09X365 ACADEMY FOR LANGUAGE AND TECHNOLOGY 54 315.0 339.0 297.0 951.0 Academy for Language and Technology 20.000000 20.000000 ... 0 0 0 0 0 0 1700 Macombs Road\nBronx, NY 10453\n(40.849102... 40.849102 -73.916088 09
141 10X342 INTERNATIONAL SCHOOL FOR LIBERAL ARTS 49 300.0 333.0 301.0 934.0 International School for Liberal Arts 55.000000 73.000000 ... 0 0 0 0 0 0 2780 Reservoir Avenue\nBronx, NY 10468\n(40.87... 40.870377 -73.898163 10
176 12X388 PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE 30 321.0 351.0 298.0 970.0 0 129.028846 197.038462 ... 0 0 0 0 0 0 1300 Boynton Avenue\nBronx, NY 10472\n(40.8313... 40.831366 -73.878823 12
253 19K583 MULTICULTURAL HIGH SCHOOL 29 279.0 322.0 286.0 887.0 Multicultural High School 44.000000 44.000000 ... 0 0 0 0 0 0 999 Jamaica Avenue\nBrooklyn, NY 11208\n(40.69... 40.691144 -73.868426 19
286 24Q296 PAN AMERICAN INTERNATIONAL HIGH SCHOOL 55 317.0 323.0 311.0 951.0 0 129.028846 197.038462 ... 0 0 0 0 0 0 45-10 94Th Street\nElmhurst, NY 11373\n(40.743... 40.743303 -73.870575 24

8 rows × 160 columns


In [19]:
m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
m.fillcontinents(color='white',lake_color='#85A6D9')


longitudes = full_hispanic['lon'].tolist()
latitudes = full_hispanic['lat'].tolist()

m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=full_hispanic['sat_score'], cmap='summer')
    
plt.show()


The schools listed above appear to primarily be geared towards recent immigrants to the US. These schools have a lot of students who are learning English, which would explain the lower SAT scores.


In [20]:
less_hispanic = combined[(combined['hispanic_per'] < 10) & (combined['sat_score'] > 1800)]
less_hispanic


Out[20]:
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score sat_score SchoolName AP Test Takers Total Exams Taken ... priority05 priority06 priority07 priority08 priority09 priority10 Location 1 lat lon school_dist
37 02M475 STUYVESANT HIGH SCHOOL 832 679.0 735.0 682.0 2096.0 STUYVESANT HS 1510.0 2819.0 ... 0 0 0 0 0 0 345 Chambers Street\nNew York, NY 10282\n(40.7... 40.717746 -74.014049 02
151 10X445 BRONX HIGH SCHOOL OF SCIENCE 731 632.0 688.0 649.0 1969.0 BRONX HS OF SCIENCE 1190.0 2435.0 ... 0 0 0 0 0 0 75 West 205 Street\nBronx, NY 10468\n(40.87705... 40.877056 -73.889780 10
187 13K430 BROOKLYN TECHNICAL HIGH SCHOOL 1277 587.0 659.0 587.0 1833.0 BROOKLYN TECHNICAL HS 2117.0 3692.0 ... 0 0 0 0 0 0 29 Ft Greene Place\nBrooklyn, NY 11217\n(40.68... 40.688107 -73.976745 13
327 28Q687 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO... 121 612.0 660.0 596.0 1868.0 Queens HS for Science York Colllege 215.0 338.0 ... 0 0 0 0 0 0 94-50 159 Street\nJamaica, NY 11433\n(40.70099... 40.700999 -73.798154 28
356 31R605 STATEN ISLAND TECHNICAL HIGH SCHOOL 227 635.0 682.0 636.0 1953.0 STATEN ISLAND TECHNICAL HS 528.0 905.0 ... 0 0 0 0 0 0 485 Clawson Street\nStaten Island, NY 10306\n(... 40.567913 -74.115362 31

5 rows × 160 columns

Many of the schools above appear to be specialized science and technology schools that receive extra funding, and only admit students who pass an entrance exam. This doesn't explain the low hispanic_per, but it does explain why their students tend to do better on the SAT -- they are students from all over New York City who did well on a standardized test.


In [21]:
correlations[['male_per', 'female_per']].plot.bar()


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f00093c2ac8>

There is little correlation: negative for male and positive for female. But its not strong.


In [22]:
combined.plot.scatter("female_per", "sat_score")
combined.plot.scatter("male_per", "sat_score")


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f000930e278>

No obvious correlation. But it is noticeable, that on the both plots marks are similar for SAT score > 1600 (but in mirror reflection). This explains weak correlation toward female score.


In [23]:
combined[(combined['female_per'] > 60) & (combined['sat_score'] > 1700)]


Out[23]:
DBN SCHOOL NAME Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score sat_score SchoolName AP Test Takers Total Exams Taken ... priority05 priority06 priority07 priority08 priority09 priority10 Location 1 lat lon school_dist
5 01M696 BARD HIGH SCHOOL EARLY COLLEGE 130 624.0 604.0 628.0 1856.0 0 129.028846 197.038462 ... 0 0 0 0 0 0 525 East Houston Street\nNew York, NY 10002\n(... 40.718962 -73.976066 01
26 02M416 ELEANOR ROOSEVELT HIGH SCHOOL 127 572.0 594.0 592.0 1758.0 Eleanor Roosevelt High School 155.000000 235.000000 ... 0 0 0 0 0 0 411 East 76 Street\nNew York, NY 10021\n(40.77... 40.770116 -73.953379 02
60 03M479 BEACON HIGH SCHOOL 261 577.0 575.0 592.0 1744.0 BEACON SCHOOL 166.000000 197.000000 ... 0 0 0 0 0 0 227-243 West 61St Street\nNew York, NY 10023\n... 40.772158 -73.987797 03
61 03M485 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A... 531 566.0 564.0 577.0 1707.0 FIORELLO H.LAGUARDIA HS 691.000000 1236.000000 ... 0 0 0 0 0 0 100 Amsterdam Avenue\nNew York, NY 10023\n(40.... 40.773671 -73.985269 03
302 25Q525 TOWNSEND HARRIS HIGH SCHOOL 278 621.0 651.0 638.0 1910.0 TOWNSEND HARRIS HS 613.000000 796.000000 ... 0 0 0 0 0 0 149-11 Melbourne Avenue\nFlushing, NY 11367\n(... 40.734408 -73.821417 25

5 rows × 160 columns

These schools appears to be very selective liberal arts schools that have high academic standards.


In [24]:
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']

In [ ]: